Objective of the study are:
Identify the top-selling region in the EU Superstore dataset.
Identify top-selling product categories driving overall revenue.
Examine key subcategories driving sales to gain insights into customer-favored product types.
Use rolling window analysis to smooth out short-term fluctuations and reveal trends in metrics such as sales, profit, quantity, and discount by calculating their rolling averages over a set time window.
Examine quarterly sales data, group by quarters, and create visualizations to reveal seasonal patterns and fluctuations in Sample EU Superstore’s performance.
Analyze the profit margin for each product.
Analyze which products or categories respond better to discounts.
What are the top categories and sub-categories in terms of sales and profit?
Are there any specific products that contribute significantly to profit?
What is the distribution of sales, quantity, discount, and profit?
How are sales and profit distributed across different regions, categories, and sub-categories?
Which region has the highest sales?
How do profit and sales vary across different regions?
How does the average sales vary across different ship modes?
Are there any products that can consistently contribute to high profits?
How does the application of discounts impact the sales performance of different products or categories?
How does the profit margin vary across different products?
summary_df <- summary(Sample_EU_Superstore[c("Sales", "Quantity", "Discount", "Profit")])
summary_df <- as.data.frame(as.table(summary_df))
colnames(summary_df) <- c("Variable", "Statistic", "Value")
# Replace 'Variable' column values with the actual variable names
summary_df$Variable <- c("Sales", "Quantity", "Profit", "Discount")
# Assuming your summary output looks something like this
summary_df <- summary(Sample_EU_Superstore[c("Sales", "Quantity", "Discount", "Profit")])
summary_df <- as.data.frame(as.table(summary_df))
colnames(summary_df) <- c("Variable", "Statistic", "Value")
# Replace 'Variable' column values with the actual variable names
summary_df$Variable <- c("Sales", "Quantity", "Profit", "Discount")
# Region-wise analysis
region_sales <- Sample_EU_Superstore %>%
group_by(Region) %>%
summarise(total_sales = sum(Sales))
as.data.frame(print(region_sales))
## # A tibble: 3 × 2
## Region total_sales
## <chr> <dbl>
## 1 Central 1720553.
## 2 North 625575.
## 3 South 591961.
## Region total_sales
## 1 Central 1720552.6
## 2 North 625575.0
## 3 South 591961.5
# Bar plot for sales across regions with Plotly
region_sales_plot<- ggplot(region_sales, aes(x = Region, y = total_sales, fill = Region)) +
geom_bar(stat = "identity") +
labs(title = "Total Sales Across Regions", x = "Region", y = "Total Sales") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to plotly
region_plot<-ggplotly(region_sales_plot)
region_plot
#Proportions
region_sales_prop <- Sample_EU_Superstore %>%
group_by(Region) %>%
summarise(total_sales = sum(Sales))
# Calculate the percentage contribution
region_sales_prop$Percentage <- (region_sales$total_sales / sum(region_sales$total_sales)) * 100
# Print the results
print(region_sales_prop)
## # A tibble: 3 × 3
## Region total_sales Percentage
## <chr> <dbl> <dbl>
## 1 Central 1720553. 58.6
## 2 North 625575. 21.3
## 3 South 591961. 20.1
# Stacked bar plot for percentage contribution of sales by region with Plotly
region_plot <- ggplotly(
ggplot(region_sales_prop, aes(x = Region, y = Percentage, fill = Region)) +
geom_bar(stat = "identity", position = "stack") +
labs(title = "Percentage Contribution of Sales by Region",
x = "Region",
y = "Percentage (%)") +
theme_minimal()
)
region_plot
# Category-wise analysis
category_sales <- Sample_EU_Superstore %>%
group_by(Category) %>%
summarise(total_sales = sum(Sales))
print(category_sales)
## # A tibble: 3 × 2
## Category total_sales
## <chr> <dbl>
## 1 Furniture 779103.
## 2 Office Supplies 1046307.
## 3 Technology 1112679.
# Convert total_sales to decimal format
category_sales$total_sales <- format(category_sales$total_sales, scientific = FALSE)
# Bar plot for sales across categories
category_sales_plot <- ggplot(category_sales, aes(x = Category, y = total_sales, fill = Category)) +
geom_bar(stat = "identity") +
labs(title = "Total Sales Across Categories", x = "Category", y = "Total Sales") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to plotly
category_sales_plotly <- ggplotly(category_sales_plot)
# Display the plotly object
category_sales_plotly
## Sub-category-wise analysis
sub_category_profit <- Sample_EU_Superstore %>%
group_by(`Sub-Category`) %>%
summarise(total_profit = sum(Profit))
print(sub_category_profit)
## # A tibble: 17 × 2
## `Sub-Category` total_profit
## <chr> <dbl>
## 1 Accessories 33442.
## 2 Appliances 46337.
## 3 Art 30794.
## 4 Binders 19760.
## 5 Bookcases 56407.
## 6 Chairs 19792.
## 7 Copiers 56200.
## 8 Envelopes 8640.
## 9 Fasteners 4428.
## 10 Furnishings 13650.
## 11 Labels 3289.
## 12 Machines 17287.
## 13 Paper 8884.
## 14 Phones 37344.
## 15 Storage 27950.
## 16 Supplies 9625.
## 17 Tables -20998.
sub_category_profit_plot <- ggplot(sub_category_profit, aes(x = `Sub-Category`, y = total_profit, fill = `Sub-Category`)) +
geom_bar(stat = "identity") +
labs(title = "Total Profit Across Sub-categories", x = "Sub-category", y = "Total Profit") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
sub_category_profit_plotly <- ggplotly(sub_category_profit_plot)
sub_category_profit_plotly
# Convert Order Date to Date format
Sample_EU_Superstore$`Order Date` <- as.Date(Sample_EU_Superstore$`Order Date`)
# Extract quarter from Order Date
Sample_EU_Superstore$Quarter <- quarter(Sample_EU_Superstore$`Order Date`, with_year = TRUE)
# Aggregate Quarterly Sales by Segments
quarterly_sales <- Sample_EU_Superstore %>%
group_by(Segment, Quarter) %>%
summarise(total_sales = sum(Sales))
## `summarise()` has grouped output by 'Segment'. You can override using the
## `.groups` argument.
print(quarterly_sales)
## # A tibble: 48 × 3
## # Groups: Segment [3]
## Segment Quarter total_sales
## <chr> <dbl> <dbl>
## 1 Consumer 2015. 24033.
## 2 Consumer 2015. 55360.
## 3 Consumer 2015. 92606.
## 4 Consumer 2015. 84076.
## 5 Consumer 2016. 64388.
## 6 Consumer 2016. 87087.
## 7 Consumer 2016. 125014.
## 8 Consumer 2016. 98498.
## 9 Consumer 2017. 74263.
## 10 Consumer 2017. 94378.
## # ℹ 38 more rows
# Plot Quarterly Sales by Segments
quarterly_sales_plot <- ggplot(quarterly_sales, aes(x = Quarter, y = total_sales, color = Segment)) +
geom_line(size = 1.5) +
geom_point(size = 3) +
labs(title = "Quarterly Sales by Segments",
x = "Quarter",
y = "Total Sales",
color = "Segments") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Display the interactive plot using plotly
ggplotly(quarterly_sales_plot)
# Convert Order Date to Date format
Sample_EU_Superstore$`Order Date` <- as.Date(Sample_EU_Superstore$`Order Date`)
# Define a function to compute the desired metric (e.g., rolling average) within the rolling window
rolling_window_function <- function(data) {
# Replace 'Sales' with the desired metric (Sales, Profit, Quantity, Discount)
result <- mean(data, na.rm = TRUE)
return(result)
}
# Set the size of the rolling window (e.g., 3 months)
window_size <- 3
# Apply rolling window analysis to the entire dataset
rolling_window_results <- Sample_EU_Superstore %>%
arrange(`Order Date`) %>%
group_by(`Order Date`) %>%
summarise(Sales = sum(Sales)) %>%
mutate(rolling_sales = zoo::rollapply(Sales, window_size, rolling_window_function, align = "right", fill = NA, partial = TRUE))
print(rolling_window_results) # Display the results
## # A tibble: 1,253 × 3
## `Order Date` Sales rolling_sales
## <date> <dbl> <dbl>
## 1 2015-01-01 44.9 44.9
## 2 2015-01-03 854. 450.
## 3 2015-01-04 183. 361.
## 4 2015-01-05 297. 445.
## 5 2015-01-06 324. 268.
## 6 2015-01-07 910. 510.
## 7 2015-01-08 1103. 779.
## 8 2015-01-10 206. 739.
## 9 2015-01-11 1969. 1093.
## 10 2015-01-12 1671. 1282.
## # ℹ 1,243 more rows
print(rolling_window_results)
## # A tibble: 1,253 × 3
## `Order Date` Sales rolling_sales
## <date> <dbl> <dbl>
## 1 2015-01-01 44.9 44.9
## 2 2015-01-03 854. 450.
## 3 2015-01-04 183. 361.
## 4 2015-01-05 297. 445.
## 5 2015-01-06 324. 268.
## 6 2015-01-07 910. 510.
## 7 2015-01-08 1103. 779.
## 8 2015-01-10 206. 739.
## 9 2015-01-11 1969. 1093.
## 10 2015-01-12 1671. 1282.
## # ℹ 1,243 more rows
# Line chart for rolling sales with enhanced readability
plot <- ggplot(rolling_window_results, aes(x = `Order Date`, y = rolling_sales)) +
geom_line(color = "lightblue", linewidth = 1, linetype = "solid") +
geom_smooth(method = "loess", se = FALSE, color = "green", linetype = "dashed") +
labs(title = "Rolling Window Analysis - Sales Over Time",
x = "Order Date",
y = "Rolling Sales",
caption = "Dashed line represents a smoothed trend") +
theme_minimal() +
theme(legend.position = "none") +
annotate("text", x = max(rolling_window_results$`Order Date`), y = max(rolling_window_results$rolling_sales),
label = "Smoothed Trend", hjust = 1.2, vjust = 0.5, color = "orange") +
geom_point(size = 2, color = "pink")
rolling_window_plot <- ggplotly(plot); rolling_window_plot
## `geom_smooth()` using formula = 'y ~ x'
# Convert 'Order Date' to a Date type
Sample_EU_Superstore$`Order Date` <- as.Date(Sample_EU_Superstore$`Order Date`)
# Extract year from 'Order Date'
Sample_EU_Superstore <- Sample_EU_Superstore %>%
mutate(Year = year(`Order Date`))
# Group by year and calculate total sales and profit
yearly_summary <- Sample_EU_Superstore %>%
group_by(Year) %>%
summarise(TotalSales = sum(Sales),
TotalProfit = sum(Profit))
print(yearly_summary)
## # A tibble: 4 × 3
## Year TotalSales TotalProfit
## <dbl> <dbl> <dbl>
## 1 2015 477797. 61376.
## 2 2016 652647. 84025.
## 3 2017 765441. 98484.
## 4 2018 1042204. 128944.
# Visualize the results with a line chart
ggplotly(ggplot(yearly_summary, aes(x = factor(Year), y = TotalSales, group = 1)) +
geom_line(aes(color = "Total Sales"), size = 1) +
geom_line(aes(y = TotalProfit, color = "Total Profit"), size = 1) +
ggtitle("Yearly Sales and Profit") +
xlab("Year") +
ylab("Amount") +
scale_color_manual(values = c("Total Sales" = "#4e79a7", "Total Profit" = "#e15759")) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 18, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12),
legend.title = element_blank(),
legend.text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white"),
legend.position = "top"
)
)
###Product Analysis
# Group by product and calculate total sales, profit, and profit margin
product_summary <- Sample_EU_Superstore %>%
group_by(`Product ID`, `Product Name`) %>%
summarise(TotalSales = sum(Sales),
TotalProfit = sum(Profit),
ProfitMargin = sum(Profit) / sum(Sales) * 100) %>%
arrange(desc(TotalProfit))
## `summarise()` has grouped output by 'Product ID'. You can override using the
## `.groups` argument.
# Select the top N products (adjust N as needed)
top_products <- head(product_summary, 10)
# Visualize the results with rotated x-axis labels
ggplotly(ggplot(top_products, aes(x = reorder(`Product Name`, -TotalProfit), y = TotalProfit)) +
geom_bar(stat = "identity",fill = "yellow") +
ggtitle("Top 10 Products by Total Profit") +
xlab("Product Name") +
ylab("Total Profit") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate labels for better readability
)
#Discount Analysis
# Assuming you have a column 'Discount' in your dataset
discount_analysis <- Sample_EU_Superstore %>%
group_by(Category, `Product Name`, Discount) %>%
summarise(AvgSales = mean(Sales))
## `summarise()` has grouped output by 'Category', 'Product Name'. You can
## override using the `.groups` argument.
discount_analysis
## # A tibble: 3,819 × 4
## # Groups: Category, Product Name [1,856]
## Category `Product Name` Discount AvgSales
## <chr> <chr> <dbl> <dbl>
## 1 Furniture Advantus Clock, Black 0 132.
## 2 Furniture Advantus Clock, Duo Pack 0 175.
## 3 Furniture Advantus Clock, Duo Pack 0.3 144.
## 4 Furniture Advantus Clock, Durable 0 242.
## 5 Furniture Advantus Clock, Durable 0.1 218.
## 6 Furniture Advantus Clock, Durable 0.3 67.8
## 7 Furniture Advantus Clock, Erganomic 0 100.
## 8 Furniture Advantus Clock, Erganomic 0.3 140.
## 9 Furniture Advantus Door Stop, Black 0 158.
## 10 Furniture Advantus Door Stop, Black 0.4 27.0
## # ℹ 3,809 more rows
# Visualize the results
plot <- ggplot(discount_analysis, aes(x = Discount, y = AvgSales, fill = Category)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Average Sales with Different Discounts") +
xlab("Discount") +
ylab("Average Sales") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Use ggplotly to convert to interactive plotly plot
plotly_plot <- ggplotly(plot)
plotly_plot
#product analysis
# Calculate total profit for each product
product_profit <- Sample_EU_Superstore %>%
group_by(`Product ID`, `Product Name`) %>%
summarize(TotalProfit = sum(Profit))
## `summarise()` has grouped output by 'Product ID'. You can override using the
## `.groups` argument.
# Identify top products with consistently high profits
top_profitable_products <- product_profit %>%
filter(TotalProfit > quantile(TotalProfit, 0.75)) # Adjust the quantile threshold as needed
top_profitable_products
## # A tibble: 23 × 3
## # Groups: Product ID [23]
## `Product ID` `Product Name` TotalProfit
## <chr> <chr> <dbl>
## 1 FUR-CH-10002212 Hon Chairmat, Black -52.7
## 2 FUR-CH-10003249 Novimex Chairmat, Adjustable 316.
## 3 FUR-FU-10000295 Advantus Light Bulb, Durable 57.1
## 4 FUR-FU-10000361 Eldon Stacking Tray, Duo Pack 24.6
## 5 FUR-FU-10001252 Eldon Frame, Duo Pack 419.
## 6 OFF-AP-10003577 Hoover Stove, Silver 364.
## 7 OFF-AP-10003758 KitchenAid Blender, Silver 326.
## 8 OFF-AR-10003651 Sanford Pencil Sharpener, Easy-Erase 1028.
## 9 OFF-BI-10002570 Acco Binder Covers, Clear 243.
## 10 OFF-BI-10003708 Avery Binder, Economy 162.
## # ℹ 13 more rows
# Visualize top profitable products
ggplotly(ggplot(top_profitable_products, aes(x = reorder(`Product Name`, -TotalProfit), y = TotalProfit)) +
geom_bar(stat = "identity", fill = "pink") +
labs(title = "Top Profitable Products",
x = "Product Name",
y = "Total Profit") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
)
###ship mode analysis:
# Group the data by ship mode and calculate the average sales for each mode
ship_mode_sales <- Sample_EU_Superstore %>%
group_by(`Ship Mode`) %>%
summarise(AverageSales = mean(Sales, na.rm = TRUE))
# Print the result
print(ship_mode_sales)
## # A tibble: 4 × 2
## `Ship Mode` AverageSales
## <chr> <dbl>
## 1 First Class 284.
## 2 Same Day 319.
## 3 Second Class 304.
## 4 Standard Class 291.
# Create a more attractive bar plot
ggplotly(ggplot(ship_mode_sales, aes(x = reorder(`Ship Mode`, -AverageSales), y = AverageSales, fill = `Ship Mode`)) +
geom_bar(stat = "identity", size = 0.7) +
labs(title = "Average Sales Across Different Ship Modes",
x = "Ship Mode",
y = "Average Sales") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position="none") +
scale_fill_brewer(palette = "Set3") # You can choose a different color palette
)